library(ggplot2)
# Load the tab-separated pseudo-Facebook data set into `pF`.
# NOTE(review): the absolute path is machine-specific; adjust it before running elsewhere.
pF <- read.delim("/Users/jigdelkuyee/Desktop/DataAnalyst/Udacity/IntroRStudio/EDA_Course_Materials/lesson3/pseudo_facebook.tsv")
# First look: friend_count against age for every row.
# Uses ggplot() + geom_point() because qplot() is deprecated in current ggplot2.
ggplot(pF, aes(x = age, y = friend_count)) +
  geom_point()
Not surprisingly, younger users tend to have more friends. Users older than 30 have roughly 200 or more friends on average. ***
# Same scatter, with the x axis limited to ages 13 through 90.
# xlim() drops rows outside the range, which is what triggers the warning.
ggplot(data = pF, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Jitter the points and draw them at 1/20 opacity to reduce overplotting.
ggplot(data = pF, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5187 rows containing missing values (geom_point).
Notes:
# Square-root transform of the y axis. Vertical jitter is disabled
# (height = 0) so that only the x position is perturbed.
ggplot(data = pF, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5169 rows containing missing values (geom_point).
#### What do you notice? The square-root scale makes the points with friend counts below 1000 much easier to see. ***
Notes:
# friendships_initiated against age, one panel per gender.
# Rows with a missing gender are excluded before plotting.
ggplot(
  data = subset(pF, !is.na(gender)),
  mapping = aes(x = age, y = friendships_initiated)
) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  facet_wrap(~gender)
## Warning: Removed 2054 rows containing missing values (geom_point).
## Warning: Removed 3083 rows containing missing values (geom_point).
Not surprising to notice that younger males tend to initiate more friendships. ***
Notes:
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the rows by age, then collapse each group to its mean and median
# friend_count plus the number of rows in the group.
age_groups <- pF %>% group_by(age)
pF.fc_by_age <- age_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  )
# Show the first row of the summary as a quick check.
head(pF.fc_by_age, n = 1)
## Source: local data frame [1 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.75 74 484
#Another way to implement - chain commands (like D3.js)
# pF.fc_by_age <- pF %>%
# group_by(age) %>%
# summarise(friend_count_mean = mean(friend_count),
# friend_count_median = median(friend_count),
# n= n())
# head(pF.fc_by_age,1)
Create your plot!
# Line plot of the per-age mean friend count from the summary table.
ggplot(data = pF.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
#fancy edit: http://docs.ggplot2.org/current/geom_line.html
Notes:
# Raw scatter (orange) overlaid with per-age summary lines:
# the mean (black, solid), the 10th and 90th percentiles (blue, dashed),
# and the median (blue, solid).
# coord_cartesian() zooms without dropping rows, so the summaries are still
# computed from all of the data.
# The summary function is given as `fun` and its extra arguments through
# `fun.args`; current ggplot2 ignores `fun.y` and a bare `probs` when they
# are passed through geom_line(), which would fall back to the default summary.
ggplot(data = pF, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000)) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  )
Notes:
# See the documentation for cor.test before interpreting the result:
# ?cor.test
# Correlation test between age and friend_count over all rows
# (cor.test uses Pearson's method by default).
cor.test(x = pF$age, y = pF$friend_count)
##
## Pearson's product-moment correlation
##
## data: pF$age and pF$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# -0.027
#rule of thumb: |r| should be at least about 0.3 (r <= -0.3 or r >= 0.3) for a correlation to be meaningful; -0.027 indicates essentially no linear relationship
# Same test written with with(), so the columns can be named directly
# and the method is stated explicitly.
with(pF, cor.test(x = age, y = friend_count, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
What’s the correlation between age and friend count for users younger than 70? Round to three decimal places. -0.171. This is a negative correlation: as age increases, friend count tends to decrease.
# Repeat the correlation test using only rows with age below 70.
with(
  subset(pF, subset = age < 70),
  cor.test(x = age, y = friend_count)
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.3261, df = 90664, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1775257 -0.1648889
## sample estimates:
## cor
## -0.1712144
A Visual Guide to Correlation · Correlation Coefficient · Intro to Inferential Statistics: Correlation
Correlation coefficients are often denoted with the Greek letter rho, in addition to the letter r.
What are monotonic functions?
Correlation Methods: Pearson’s r, Spearman’s rho, and Kendall’s tau
Relationship between likes_received and www_likes_received
# Scatter of likes_received against www_likes_received with a linear fit.
# Each axis is clipped at the 95th percentile of the column it displays, so
# extreme values do not dominate the plot. xlim()/ylim() drop the rows that
# fall outside the limits, so the fit uses only the remaining rows.
# The y limit previously used pF$www_likes, which is not the plotted column;
# row counts in previously recorded warnings will therefore differ.
ggplot(data = pF, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 1 / 10) +
  xlim(0, quantile(pF$www_likes_received, 0.95)) +
  ylim(0, quantile(pF$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 11608 rows containing missing values (stat_smooth).
## Warning: Removed 11608 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_path).
#coord_cartesian(xlim=c(0,5000), ylim=c(0,10000))
#coord_trans(xtrans="sqrt", ytrans="sqrt")
Notes:
# Correlation test between the two likes columns over all rows.
with(pF, cor.test(x = www_likes_received, y = likes_received))
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Notes:
#install.packages('alr3')
library(alr3)
## Loading required package: car
# Load the Mitchell data set and preview its first two rows.
data("Mitchell")
head(Mitchell, n = 2)
## Month Temp
## 1 0 -5.18333
## 2 1 -1.65000
Create your plot!
# Correlation test between Month and Temp in the Mitchell data.
cor.test(x = Mitchell$Month, y = Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
# Temp against Month with an axis break every 12 months.
# Month is numeric, so the breaks are set on a continuous x scale; a discrete
# scale is intended for categorical data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
# Correlation test between Month and Temp, shown alongside the plot.
cor.test(Mitchell$Month, Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
# Overlay the data on a 12-month cycle by plotting Temp against Month modulo 12.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()
Notes:
# Fractional age: age plus (12 - dob_month) / 12.
pF$age_with_months <- with(pF, age + (12 - dob_month) / 12)

# Mean and median friend_count, plus the row count, for each fractional age.
pF.fc_by_age_months <- summarise(
  group_by(pF, age_with_months),
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
Programming Assignment
# Mean friend count by fractional age, restricted to values below 71.
ggplot(
  data = subset(pF.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
# Three views of mean friend count for ages below 71, at decreasing resolution.
# p1: mean by fractional age (age_with_months).
p1 <- ggplot(
  data = subset(pF.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean by whole-year age.
p2 <- ggplot(
  data = subset(pF.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()

# p3: mean by age rounded to the nearest multiple of 5, computed while plotting.
# The summary function is passed as `fun`; current ggplot2 ignores `fun.y`
# when it is passed through geom_line().
p3 <- ggplot(
  data = subset(pF, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
## Loading required package: grid
# Stack the three plots in a single column for comparison.
gridExtra::grid.arrange(p1, p2, p3, ncol = 1)
Notes:
# Per-age mean friend count (ages below 71) with a smoother drawn on top;
# geom_smooth() picks its method automatically here.
ggplot(
  data = subset(pF.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
During EDA, try out different plots with the datasets and see which plot communicates your ideas in the most succinct and clear manner. That plot will be one of the few that you then share with your audience.